#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
爬取国家统计局最新地址库
省市区三级（Json版本）
author: xiexie1993
time: 2020-07-22
"""

import requests
from bs4 import BeautifulSoup
import json
import os
import datetime


def get_province(index_href, base_url=None):
    """Fetch the province index page and write one province code per line.

    The page at ``base_url + index_href`` is downloaded, every anchor matched
    by the CSS selector ``.provincetr a`` is read, and a province code is
    built from the part of the anchor's ``href`` before the first ``.`` with
    ``'0000'`` appended. The codes are written, one per line, to a file named
    ``province<YYYYmmddHHMMSS>.json`` in the current working directory.
    Despite the ``.json`` extension the file content is plain text, not JSON.

    Args:
        index_href: Path of the index page relative to the base URL.
        base_url: Base URL to prepend to ``index_href``. When omitted, the
            module-level ``url`` variable is used, which is only defined when
            this file is run as a script.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    if base_url is None:
        # Backward-compatible fallback to the global set by the entry point.
        base_url = url
    province_url = base_url + index_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(province_url, headers=headers, timeout=30)
    # Fail loudly instead of parsing an error page into an empty output file.
    response.raise_for_status()
    # Decode the body as GBK (presumably the page's encoding -- TODO confirm).
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "html.parser")
    province_links = soup.select('.provincetr a')

    now_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    filename = "province" + now_time + ".json"
    print('[Debug] filename=', filename)

    province_count = 0
    # The context manager guarantees the file is closed even if a row fails.
    with open(filename, 'a', encoding='utf-8') as out_file:
        for province_link in province_links:
            province_href = province_link.get('href')
            if not province_href:
                # An anchor without a target carries no province number.
                continue
            # Keep the text before the first dot, then pad with '0000'.
            province_no = province_href.split('.')[0]
            province_code = province_no + '0000'
            province_name = province_link.text
            province_count += 1
            print("[Debug] 开始抓取第%s个省:%s,编码：%s" % (province_count, province_name, province_code))
            out_file.write(province_code + "\n")

    print('[INFO]抓取省份下市信息结束！')
    print('[INFO]数据写入完成！')


# Script entry point
if __name__ == "__main__":
    print('[INFO]开始…')
    # Base URL of the 2019 pages. It must stay a module-level global named
    # ``url`` because get_province() reads it to build the request URL.
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
    get_province(index_href='index.html')
